Packages used to create the below figures:
library(tidyverse)
library(knitr)
library(kableExtra)
library(plotly)
library(lubridate)
library(readxl)
library(DT)
library(ggcorrplot)
Load partial volume corrected regional tau-PET data, as downloaded from ADNI:
tau.df <- read.csv("../../ADNI_Data/Raw_Data/UCBERKELEYAV1451_PVC_05_12_20.csv", stringsAsFactors = T)
tau.df$EXAMDATE <- as.Date(as.character(tau.df$EXAMDATE), format="%m/%d/%Y")
Filter tau data to contain only subjects with 2+ tau-PET scans, and omit irrelevant columns:
tau.df <- tau.df %>%
select(-VISCODE, -update_stamp, -HEMIWM_SUVR, -BRAAK12_SUVR,
-BRAAK34_SUVR, -BRAAK56_SUVR, -OTHER_SUVR) %>%
select(!matches("VOLUME")) %>%
group_by(RID)
colnames(tau.df) <- str_replace_all(colnames(tau.df), "_SUVR", "")
As shown in Data Understanding, the ROIs are not precisely standardized to the inferior cerebellum gray matter SUVR. I will re-standardize the ROI SUVR values here:
tau.stand <- tau.df
for (i in 4:ncol(tau.stand)) {
tau.stand[i] <- tau.stand[i]/ tau.df[4]
}
rm(tau.df)
The next step is to select brain regions a priori. I am going to stratify the cortical parcellations and subcortical segmentations based on Schöll et al. (2016) and per UCSF’s recommendations for usage of their tau-PET data. Here is the stratification across the Braak stages:
roi.braak <- read.csv("roi_braak_stages.csv") %>% mutate(ROI_Name = tolower(ROI_Name)) %>%
mutate(Hemisphere = ifelse(str_detect(ROI_Name, "rh_|right"), "Right", "Left"))
datatable(roi.braak)
I will filter the tau-PET dataset to only include SUVR data for ROIs detailed in the above list, by first reshaping the tau-PET SUVR data from wide to long. Then, I will merge left and right hemisphere ROIs into one bilateral ROI by taking the mean SUVR.
tau.stand.roi <- tau.stand %>%
pivot_longer(., cols=c(-RID, -VISCODE2, -EXAMDATE), names_to="ROI_Name", values_to="SUVR") %>%
mutate(ROI_Name=tolower(ROI_Name)) %>%
semi_join(., roi.braak) %>%
left_join(., roi.braak) %>%
mutate(ROI_Name = str_replace_all(ROI_Name, "right_|left_|ctx_rh_|ctx_lh_", "")) %>%
group_by(RID, VISCODE2, EXAMDATE, ROI_Name, Braak) %>%
summarise(SUVR = mean(SUVR, na.rm=T))
Now, I will re-shape the data back to wide to be compatible with the cognitive status data shape.
tau.stand.roi <- tau.stand.roi %>%
select(-Braak) %>%
pivot_wider(id_cols=c(RID, VISCODE2, EXAMDATE), names_from="ROI_Name",
values_from="SUVR")
str(tau.stand.roi)
## tibble [1,120 x 28] (S3: grouped_df/tbl_df/tbl/data.frame)
## $ RID : int [1:1120] 21 31 31 56 56 56 59 69 69 69 ...
## $ VISCODE2 : Factor w/ 16 levels "","bl","m108",..: 7 7 8 7 8 9 7 7 8 9 ...
## $ EXAMDATE : Date[1:1120], format: "2018-02-02" "2018-04-24" "2019-04-23" "2018-02-20" ...
## $ bankssts : num [1:1120] 1.34 1.26 1.23 1.32 1.32 ...
## $ caudalanteriorcingulate : num [1:1120] 1.29 1.32 1.25 1.35 1.4 ...
## $ cuneus : num [1:1120] 1.81 1.67 1.6 1.62 1.65 ...
## $ entorhinal : num [1:1120] 1.54 2.28 2.22 2.07 1.95 ...
## $ fusiform : num [1:1120] 1.5 1.38 1.35 1.42 1.4 ...
## $ hippocampus : num [1:1120] 1.56 1.52 1.59 1.29 1.36 ...
## $ inferiortemporal : num [1:1120] 1.67 1.48 1.45 1.61 1.61 ...
## $ insula : num [1:1120] 1.14 1.22 1.18 1.21 1.18 ...
## $ isthmuscingulate : num [1:1120] 1.49 1.38 1.38 1.42 1.51 ...
## $ lateraloccipital : num [1:1120] 1.77 1.49 1.46 1.47 1.58 ...
## $ lingual : num [1:1120] 1.66 1.52 1.45 1.42 1.43 ...
## $ paracentral : num [1:1120] 1.44 1.35 1.3 1.48 1.48 ...
## $ parahippocampal : num [1:1120] 1.18 1.43 1.45 1.39 1.37 ...
## $ pericalcarine : num [1:1120] 1.61 1.33 1.29 1.23 1.27 ...
## $ postcentral : num [1:1120] 1.49 1.4 1.34 1.43 1.42 ...
## $ posteriorcingulate : num [1:1120] 1.37 1.39 1.33 1.33 1.38 ...
## $ precentral : num [1:1120] 1.37 1.4 1.33 1.3 1.34 ...
## $ precuneus : num [1:1120] 1.43 1.4 1.39 1.43 1.48 ...
## $ rostralanteriorcingulate: num [1:1120] 1.32 1.21 1.15 1.21 1.2 ...
## $ superiorfrontal : num [1:1120] 1.41 1.42 1.34 1.43 1.43 ...
## $ superiorparietal : num [1:1120] 1.39 1.5 1.46 1.49 1.52 ...
## $ superiortemporal : num [1:1120] 1.53 1.45 1.38 1.46 1.44 ...
## $ supramarginal : num [1:1120] 1.41 1.37 1.32 1.41 1.45 ...
## $ temporalpole : num [1:1120] 1.71 1.64 1.54 1.54 1.43 ...
## $ transversetemporal : num [1:1120] 1.41 1.46 1.37 1.28 1.21 ...
## - attr(*, "groups")= tibble [1,120 x 4] (S3: tbl_df/tbl/data.frame)
## ..$ RID : int [1:1120] 21 31 31 56 56 56 59 69 69 69 ...
## ..$ VISCODE2: Factor w/ 16 levels "","bl","m108",..: 7 7 8 7 8 9 7 7 8 9 ...
## ..$ EXAMDATE: Date[1:1120], format: "2018-02-02" "2018-04-24" "2019-04-23" ...
## ..$ .rows : list<int> [1:1120]
## .. ..$ : int 1
## .. ..$ : int 2
## .. ..$ : int 3
## .. ..$ : int 4
## .. ..$ : int 5
## .. ..$ : int 6
## .. ..$ : int 7
## .. ..$ : int 8
## .. ..$ : int 9
## .. ..$ : int 10
## .. ..$ : int 11
## .. ..$ : int 12
## .. ..$ : int 13
## .. ..$ : int 14
## .. ..$ : int 15
## .. ..$ : int 16
## .. ..$ : int 17
## .. ..$ : int 18
## .. ..$ : int 19
## .. ..$ : int 20
## .. ..$ : int 21
## .. ..$ : int 22
## .. ..$ : int 23
## .. ..$ : int 24
## .. ..$ : int 25
## .. ..$ : int 26
## .. ..$ : int 27
## .. ..$ : int 28
## .. ..$ : int 29
## .. ..$ : int 30
## .. ..$ : int 31
## .. ..$ : int 32
## .. ..$ : int 33
## .. ..$ : int 34
## .. ..$ : int 35
## .. ..$ : int 36
## .. ..$ : int 37
## .. ..$ : int 38
## .. ..$ : int 39
## .. ..$ : int 40
## .. ..$ : int 41
## .. ..$ : int 42
## .. ..$ : int 43
## .. ..$ : int 44
## .. ..$ : int 45
## .. ..$ : int 46
## .. ..$ : int 47
## .. ..$ : int 48
## .. ..$ : int 49
## .. ..$ : int 50
## .. ..$ : int 51
## .. ..$ : int 52
## .. ..$ : int 53
## .. ..$ : int 54
## .. ..$ : int 55
## .. ..$ : int 56
## .. ..$ : int 57
## .. ..$ : int 58
## .. ..$ : int 59
## .. ..$ : int 60
## .. ..$ : int 61
## .. ..$ : int 62
## .. ..$ : int 63
## .. ..$ : int 64
## .. ..$ : int 65
## .. ..$ : int 66
## .. ..$ : int 67
## .. ..$ : int 68
## .. ..$ : int 69
## .. ..$ : int 70
## .. ..$ : int 71
## .. ..$ : int 72
## .. ..$ : int 73
## .. ..$ : int 74
## .. ..$ : int 75
## .. ..$ : int 76
## .. ..$ : int 77
## .. ..$ : int 78
## .. ..$ : int 79
## .. ..$ : int 80
## .. ..$ : int 81
## .. ..$ : int 82
## .. ..$ : int 83
## .. ..$ : int 84
## .. ..$ : int 85
## .. ..$ : int 86
## .. ..$ : int 87
## .. ..$ : int 88
## .. ..$ : int 89
## .. ..$ : int 90
## .. ..$ : int 91
## .. ..$ : int 92
## .. ..$ : int 93
## .. ..$ : int 94
## .. ..$ : int 95
## .. ..$ : int 96
## .. ..$ : int 97
## .. ..$ : int 98
## .. ..$ : int 99
## .. .. [list output truncated]
## .. ..@ ptype: int(0)
## ..- attr(*, ".drop")= logi TRUE
ADNI compiled a merged dataset containing key information from several tables, including subject demographics, selected cognitive assessment scores, and select biomarker data.
subj.info <- read.csv("../../ADNI_Data/Raw_Data/ADNIMERGE.csv", stringsAsFactors = T, na.strings="")
Filter subject demographics data to contain only the following features of interest:
RID: Participant roster ID, which serves as unique subject identifierVISCODE: Visit codeEXAMDATE: DateAGE: Age at visitPTGENDER: Biological sexCDRSB: CDR Sum-of-Boxes score at visitDX: Current cognitive diagnosissubj.info <- subj.info %>% select(RID, VISCODE, AGE, PTGENDER, CDRSB, DX)
I actually can’t join the two datasets on the EXAMDATE feature, as these sometimes differ by one or two days depending on when the records were entered. Instead, I will join by the RID subject identifier and VISCODE, a visit code identifier.
full.df <- inner_join(tau.stand.roi, subj.info, by=c("RID", "VISCODE2"="VISCODE")) %>%
filter(!is.na(CDRSB)) %>%
group_by(RID) %>%
mutate(n_visits = n()) %>%
filter(n_visits>1) %>%
select(-n_visits)
Click to see the structure of this merged dataset:
str(full.df)
## tibble [576 x 32] (S3: grouped_df/tbl_df/tbl/data.frame)
## $ RID : int [1:576] 31 31 56 56 56 69 69 69 96 96 ...
## $ VISCODE2 : Factor w/ 30 levels "","bl","m108",..: 7 8 7 8 9 7 8 9 5 6 ...
## $ EXAMDATE : Date[1:576], format: "2018-04-24" "2019-04-23" "2018-02-20" "2019-01-10" ...
## $ bankssts : num [1:576] 1.26 1.23 1.32 1.32 1.3 ...
## $ caudalanteriorcingulate : num [1:576] 1.32 1.25 1.35 1.4 1.21 ...
## $ cuneus : num [1:576] 1.67 1.6 1.62 1.65 1.56 ...
## $ entorhinal : num [1:576] 2.28 2.22 2.07 1.95 1.98 ...
## $ fusiform : num [1:576] 1.38 1.35 1.42 1.4 1.34 ...
## $ hippocampus : num [1:576] 1.52 1.59 1.29 1.36 1.34 ...
## $ inferiortemporal : num [1:576] 1.48 1.45 1.61 1.61 1.58 ...
## $ insula : num [1:576] 1.22 1.18 1.21 1.18 1.08 ...
## $ isthmuscingulate : num [1:576] 1.38 1.38 1.42 1.51 1.36 ...
## $ lateraloccipital : num [1:576] 1.49 1.46 1.47 1.58 1.59 ...
## $ lingual : num [1:576] 1.52 1.45 1.42 1.43 1.38 ...
## $ paracentral : num [1:576] 1.35 1.3 1.48 1.48 1.41 ...
## $ parahippocampal : num [1:576] 1.43 1.45 1.39 1.37 1.29 ...
## $ pericalcarine : num [1:576] 1.33 1.29 1.23 1.27 1.22 ...
## $ postcentral : num [1:576] 1.4 1.34 1.43 1.42 1.34 ...
## $ posteriorcingulate : num [1:576] 1.39 1.33 1.33 1.38 1.25 ...
## $ precentral : num [1:576] 1.4 1.33 1.3 1.34 1.25 ...
## $ precuneus : num [1:576] 1.4 1.39 1.43 1.48 1.37 ...
## $ rostralanteriorcingulate: num [1:576] 1.21 1.15 1.21 1.2 1.06 ...
## $ superiorfrontal : num [1:576] 1.42 1.34 1.43 1.43 1.35 ...
## $ superiorparietal : num [1:576] 1.5 1.46 1.49 1.52 1.43 ...
## $ superiortemporal : num [1:576] 1.45 1.38 1.46 1.44 1.37 ...
## $ supramarginal : num [1:576] 1.37 1.32 1.41 1.45 1.37 ...
## $ temporalpole : num [1:576] 1.64 1.54 1.54 1.43 1.47 ...
## $ transversetemporal : num [1:576] 1.46 1.37 1.28 1.21 1.06 ...
## $ AGE : num [1:576] 77.7 77.7 69.6 69.6 69.6 72.9 72.9 72.9 79.6 79.6 ...
## $ PTGENDER : Factor w/ 2 levels "Female","Male": 1 1 1 1 1 2 2 2 2 2 ...
## $ CDRSB : num [1:576] 0 0 0.5 0.5 0 0.5 0 0.5 0 0 ...
## $ DX : Factor w/ 3 levels "CN","Dementia",..: 1 1 3 3 3 3 3 3 1 1 ...
## - attr(*, "groups")= tibble [243 x 2] (S3: tbl_df/tbl/data.frame)
## ..$ RID : int [1:243] 31 56 69 96 112 377 416 467 618 668 ...
## ..$ .rows: list<int> [1:243]
## .. ..$ : int [1:2] 1 2
## .. ..$ : int [1:3] 3 4 5
## .. ..$ : int [1:3] 6 7 8
## .. ..$ : int [1:3] 9 10 11
## .. ..$ : int [1:3] 12 13 14
## .. ..$ : int [1:3] 15 16 17
## .. ..$ : int [1:2] 18 19
## .. ..$ : int [1:2] 20 21
## .. ..$ : int [1:2] 22 23
## .. ..$ : int [1:2] 24 25
## .. ..$ : int [1:2] 26 27
## .. ..$ : int [1:2] 28 29
## .. ..$ : int [1:3] 30 31 32
## .. ..$ : int [1:2] 33 34
## .. ..$ : int [1:3] 35 36 37
## .. ..$ : int [1:2] 38 39
## .. ..$ : int [1:4] 40 41 42 43
## .. ..$ : int [1:2] 44 45
## .. ..$ : int [1:3] 46 47 48
## .. ..$ : int [1:3] 49 50 51
## .. ..$ : int [1:3] 52 53 54
## .. ..$ : int [1:2] 55 56
## .. ..$ : int [1:2] 57 58
## .. ..$ : int [1:4] 59 60 61 62
## .. ..$ : int [1:3] 63 64 65
## .. ..$ : int [1:2] 66 67
## .. ..$ : int [1:4] 68 69 70 71
## .. ..$ : int [1:2] 72 73
## .. ..$ : int [1:4] 74 75 76 77
## .. ..$ : int [1:3] 78 79 80
## .. ..$ : int [1:2] 81 82
## .. ..$ : int [1:2] 83 84
## .. ..$ : int [1:3] 85 86 87
## .. ..$ : int [1:2] 88 89
## .. ..$ : int [1:2] 90 91
## .. ..$ : int [1:3] 92 93 94
## .. ..$ : int [1:2] 95 96
## .. ..$ : int [1:4] 97 98 99 100
## .. ..$ : int [1:2] 101 102
## .. ..$ : int [1:2] 103 104
## .. ..$ : int [1:2] 105 106
## .. ..$ : int [1:2] 107 108
## .. ..$ : int [1:3] 109 110 111
## .. ..$ : int [1:3] 112 113 114
## .. ..$ : int [1:4] 115 116 117 118
## .. ..$ : int [1:2] 119 120
## .. ..$ : int [1:2] 121 122
## .. ..$ : int [1:2] 123 124
## .. ..$ : int [1:2] 125 126
## .. ..$ : int [1:3] 127 128 129
## .. ..$ : int [1:3] 130 131 132
## .. ..$ : int [1:2] 133 134
## .. ..$ : int [1:2] 135 136
## .. ..$ : int [1:2] 137 138
## .. ..$ : int [1:2] 139 140
## .. ..$ : int [1:2] 141 142
## .. ..$ : int [1:3] 143 144 145
## .. ..$ : int [1:2] 146 147
## .. ..$ : int [1:2] 148 149
## .. ..$ : int [1:3] 150 151 152
## .. ..$ : int [1:4] 153 154 155 156
## .. ..$ : int [1:2] 157 158
## .. ..$ : int [1:2] 159 160
## .. ..$ : int [1:2] 161 162
## .. ..$ : int [1:3] 163 164 165
## .. ..$ : int [1:2] 166 167
## .. ..$ : int [1:3] 168 169 170
## .. ..$ : int [1:2] 171 172
## .. ..$ : int [1:2] 173 174
## .. ..$ : int [1:3] 175 176 177
## .. ..$ : int [1:2] 178 179
## .. ..$ : int [1:2] 180 181
## .. ..$ : int [1:3] 182 183 184
## .. ..$ : int [1:2] 185 186
## .. ..$ : int [1:3] 187 188 189
## .. ..$ : int [1:3] 190 191 192
## .. ..$ : int [1:2] 193 194
## .. ..$ : int [1:3] 195 196 197
## .. ..$ : int [1:4] 198 199 200 201
## .. ..$ : int [1:2] 202 203
## .. ..$ : int [1:2] 204 205
## .. ..$ : int [1:2] 206 207
## .. ..$ : int [1:2] 208 209
## .. ..$ : int [1:2] 210 211
## .. ..$ : int [1:2] 212 213
## .. ..$ : int [1:2] 214 215
## .. ..$ : int [1:2] 216 217
## .. ..$ : int [1:2] 218 219
## .. ..$ : int [1:4] 220 221 222 223
## .. ..$ : int [1:2] 224 225
## .. ..$ : int [1:4] 226 227 228 229
## .. ..$ : int [1:2] 230 231
## .. ..$ : int [1:2] 232 233
## .. ..$ : int [1:3] 234 235 236
## .. ..$ : int [1:3] 237 238 239
## .. ..$ : int [1:2] 240 241
## .. ..$ : int [1:2] 242 243
## .. ..$ : int [1:4] 244 245 246 247
## .. ..$ : int [1:3] 248 249 250
## .. .. [list output truncated]
## .. ..@ ptype: int(0)
## ..- attr(*, ".drop")= logi TRUE
cat("\nNumber of longitudinal tau-PET scans with accompanying cognitive data: **\n",
nrow(full.df), "**\nNumber of subjects in merged dataset: **",
length(unique(full.df$RID)), "**\n", "\n", sep="")
Number of longitudinal tau-PET scans with accompanying cognitive data: 576 Number of subjects in merged dataset: 243 As it turns out, only 588 of the original 593 tau-PET scans had corresponding cognitive assessments. This leaves 576 unique PET scan datapoints for 243 subjects.
Lastly, before I can perform outlier detection, I need to derive the longitudinal features upon which the prediction models will be built – namely, annual change in tau-PET SUVR and annual change in CDR-Sum of Boxes score.
annual.changes <- full.df %>%
ungroup() %>%
select(-AGE, -PTGENDER, -DX, -VISCODE2) %>%
pivot_longer(cols=c(-RID, -EXAMDATE), names_to="Metric",
values_to="Value") %>%
group_by(RID, Metric) %>%
summarise(n_years = as.numeric((EXAMDATE - lag(EXAMDATE,
default=EXAMDATE[1]))/365),
change = Value - lag(Value, default=Value[1])) %>%
filter(n_years > 0) %>%
mutate(Annual_Change = change/n_years) %>%
select(-n_years, -change) %>%
group_by(RID, Metric) %>%
mutate(interval_num = row_number()) %>%
pivot_wider(., id_cols=c(RID, interval_num), names_from=Metric,
values_from=Annual_Change)
datatable(annual.changes[1:5])
Now that the datasets are merged, I can perform outlier detection. Given the multivariate nature of this dataset (i.e. multiple brain regions), I will use Cook’s Distance to estimate the relative influence of each data point in a simple multiple regression model.
cooks.distance <- cooks.distance(lm(CDRSB ~ . - RID - interval_num, data=annual.changes))
plot(cooks.distance, pch=1, cex=1)
abline(h = 4*mean(cooks.distance, na.rm = T), col = "blue")
text(x = 1:length(cooks.distance)+2,
y=cooks.distance,
labels=ifelse(cooks.distance > 4 * mean(cooks.distance, na.rm=T),
names(cooks.distance),""),
col="blue")
All but one data point have relatively low Cook’s distance values, while data point #224 has a relatively large Cook’s distance. This suggests large residuals and leverage associated with this datapoint, which could distort model fitting and accuracy. Upon further examination of this instance:
as.data.frame(t(annual.changes[224,])) %>%
rownames_to_column(var="Variable") %>%
rename("Value" = "V1") %>%
kable(., booktabs=T) %>%
kable_styling(full_width = F)
| Variable | Value |
|---|---|
| RID | 6039.0000000 |
| interval_num | 2.0000000 |
| bankssts | -0.2439197 |
| caudalanteriorcingulate | -1.3044195 |
| CDRSB | -0.6564748 |
| cuneus | -3.8446557 |
| entorhinal | -1.0297746 |
| fusiform | -4.3524547 |
| hippocampus | -0.6379813 |
| inferiortemporal | -3.3420989 |
| insula | -0.1532815 |
| isthmuscingulate | -4.2801964 |
| lateraloccipital | -7.1132861 |
| lingual | -4.6834738 |
| paracentral | 1.8873874 |
| parahippocampal | -0.8278863 |
| pericalcarine | -1.2536324 |
| postcentral | 1.8356621 |
| posteriorcingulate | -3.2157580 |
| precentral | -1.0355903 |
| precuneus | -1.8532021 |
| rostralanteriorcingulate | 0.1560680 |
| superiorfrontal | -0.6848381 |
| superiorparietal | -1.6307715 |
| superiortemporal | 1.3453180 |
| supramarginal | -1.0635341 |
| temporalpole | 0.1966368 |
| transversetemporal | 2.0079720 |
This subject exhibits very large fluctuations in tau-PET SUVR values in several brain regions for this associated time interval. Given that SUVR values typically range from 0.75-2, changes of this large magnitude is surprising, and may certainly render this data point an outlier. Fortunately, the interval_num of 2 indicates that this is the second time interval for this subject, so omitting this interval doesn’t reduce the total number of subjects in the analysis. I will remove this data point:
annual.changes <- annual.changes[-224,]
I can now finish some aspects of data exploration that depended upon refining the subject cohort as well as the features. For starters, I will visualize the correlation in annual tau change between each of the ROIs measured:
annual.roi <- annual.changes %>% ungroup() %>% select(-RID, -interval_num, -CDRSB)
roi.cor <- cor(annual.roi)
p.mat <- cor_pmat(annual.roi)
ggcorrplot(roi.cor, hc.order = TRUE,
outline.col = "white", p.mat = p.mat,
insig = "blank") %>% ggplotly() %>%
layout(autosize = F, width = 800, height = 800)